In [1]:
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

from plotly.offline import init_notebook_mode, iplot
import plotly.figure_factory as ff
import plotly.graph_objs as go
from plotly import tools

# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by pandas/plotly;
# consider filtering specific categories instead.
warnings.simplefilter('ignore')
# Let pandas display up to 100 rows before truncating the output.
pd.options.display.max_rows = 100
# Set up Plotly's offline notebook rendering so iplot() can draw into cell outputs.
init_notebook_mode(connected=True)
# IPython magic: render matplotlib figures inline in the notebook.
%matplotlib inline

Load Dataset

In [2]:
# Load the train/test splits (presumably cleaned upstream, judging by the
# file names -- confirm against the preprocessing notebook).
train = pd.read_csv('./data/train_clean.csv')
test = pd.read_csv('./data/test_clean.csv')

# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would echo a stray "None".
print('Train:')
train.info(verbose=False)
print()
print('Test:')
test.info(verbose=False)
Train:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 722142 entries, 0 to 722141
Columns: 68 entries, loan_amnt to credit_length
dtypes: float64(16), int64(41), object(11)
memory usage: 374.6+ MB
None 

Test:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98727 entries, 0 to 98726
Columns: 68 entries, loan_amnt to credit_length
dtypes: float64(16), int64(41), object(11)
memory usage: 51.2+ MB
None

Basic Dataset Information

In [3]:
# class balance of the target column (the dataset is imbalanced)
target1 = train['target'].sum()
target0 = (1 - train['target']).sum()

# count and share of all rows for each class, then the 0-to-1 ratio
for label, count in (('Target 0:\t', target0), ('Target 1:\t', target1)):
    print(label, count, '\t', np.round(count / len(train), 4))
print('0/1 Ratio:\t', np.round(target0 / target1, 4))
Target 0:	 549951 	 0.7616
Target 1:	 172191 	 0.2384
0/1 Ratio:	 3.1938
In [4]:
# visualize the target count distribution
data = [go.Bar(x=['status 0'], y=[target0], name='Status 0'), 
        go.Bar(x=['status 1'], y=[target1], name='Status 1')]
margin=go.layout.Margin(l=50, r=50, b=30, t=40, pad=4)
legend = dict(orientation='h', xanchor='auto', y=-0.2)

layout = go.Layout(title='Loan Status Count Plot', xaxis=dict(title='Loan Status'), 
                   yaxis=dict(title='Count'), autosize=False, width=700, height=400, 
                   margin=margin, legend=legend)
fig = go.Figure(data=data, layout=layout)
iplot(fig)

Visualization

In [5]:
# define categorical and numerical features
# Column names treated as categorical features.
cat_features = ['term', 'home_ownership', 'verification_status', 'purpose', 
                'title', 'addr_state', 'initial_list_status', 'application_type', 
                'grade', 'sub_grade']

# Column names treated as numerical features; each one is plotted with
# numerical_plot() in the cells below.
num_features = ['loan_amnt', 'int_rate', 'installment_ratio', 'emp_length', 'annual_inc', 
                'dti', 'delinq_2yrs', 'inq_last_6mths', 'open_acc', 'pub_rec', 
                'revol_bal', 'revol_util', 'total_acc', 'collections_12_mths_ex_med', 
                'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal', 'total_rev_hi_lim', 
                'acc_open_past_24mths', 'avg_cur_bal', 'bc_open_to_buy', 'bc_util', 
                'chargeoff_within_12_mths', 'delinq_amnt', 'mo_sin_old_il_acct', 
                'mo_sin_old_rev_tl_op', 'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl', 
                'mort_acc', 'mths_since_recent_bc', 'mths_since_recent_inq',
                'num_accts_ever_120_pd', 'num_actv_bc_tl', 'num_actv_rev_tl',
                'num_bc_sats', 'num_bc_tl', 'num_il_tl', 'num_op_rev_tl',
                'num_rev_accts', 'num_rev_tl_bal_gt_0', 'num_sats', 'num_tl_120dpd_2m',
                'num_tl_30dpd', 'num_tl_90g_dpd_24m', 'num_tl_op_past_12m', 
                'pct_tl_nvr_dlq', 'percent_bc_gt_75', 'pub_rec_bankruptcies',
                'tax_liens', 'tot_hi_cred_lim', 'total_bal_ex_mort', 'total_bc_limit',
                'total_il_high_credit_limit', 'credit_length']

# Full feature list: categorical columns first, then numerical ones.
features = cat_features + num_features

# report how many features fall into each group
print('Categorical feature:\t', len(cat_features))
print('Numerical feature:\t', len(num_features))
print('Total feature:\t\t', len(features))
Categorical feature:	 10
Numerical feature:	 54
Total feature:		 64

2. Numerical Variables

In [6]:
def numerical_plot(data, feature, width=800, height=400, bins=50):
    """Build a two-panel Plotly figure describing one numerical feature.

    Left panel: the distribution plot produced by ``ff.create_distplot`` for
    the feature values of rows with ``target == 0`` and ``target == 1``.

    Right panel: the default rate (mean of ``target``) for every distinct
    feature value. With at most ``bins`` distinct values it is drawn as one bar
    per value with a 95% confidence error bar; otherwise as a line with a
    shaded +/- 1 standard-error band.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding ``feature`` and a ``target`` column. All statistics are
        computed from this frame only.
    feature : str
        Name of the numerical column to plot.
    width, height : int
        Figure size passed to the layout.
    bins : int
        Number of histogram bins to aim for, and the distinct-value threshold
        that selects bars vs. line for the right panel.

    Returns
    -------
    The figure created by ``tools.make_subplots``; it is not displayed here.
    """
    axis_label = feature.capitalize()
    titles = ('Histogram Plot', 'Default Rate vs. ' + axis_label)
    fig = tools.make_subplots(rows=1, cols=2, print_grid=False, subplot_titles=titles)

    # ---- panel 1: distribution of the feature for each loan status ----
    x0 = data.loc[data['target'] == 0, feature]
    x1 = data.loc[data['target'] == 1, feature]

    start = min(x0.min(), x1.min())
    end = max(x0.max(), x1.max())
    # dropna=False keeps a missing value counted as one distinct value
    n_unique = data[feature].nunique(dropna=False)
    if n_unique <= min(end - start + 1, bins):
        # few distinct values over a range at least that wide: unit-width bins
        bin_size = 1
    else:
        bin_size = (end - start) / bins

    distplot = ff.create_distplot(hist_data=[x0, x1],
                                  group_labels=['Status 0', 'Status 1'],
                                  bin_size=bin_size, show_rug=False)
    for trace in distplot['data']:
        fig.append_trace(trace, 1, 1)

    # ---- panel 2: default rate per feature value ----
    rates = _default_rate_by_value(data, feature)
    if n_unique <= bins:
        rate_traces = _default_rate_bars(rates)
    else:
        rate_traces = _default_rate_band(rates)
    for trace in rate_traces:
        fig.append_trace(trace, 1, 2)

    # ---- layout ----
    legend = dict(orientation='h', xanchor='auto', y=-0.2)
    margin = go.layout.Margin(l=50, r=50, b=50, t=40, pad=4)
    fig['layout'].update(xaxis=dict(domain=[0, 0.47]), xaxis2=dict(domain=[0.53, 1]),
                         yaxis2=dict(anchor='x2'), width=width, height=height,
                         margin=margin, legend=legend)
    fig['layout']['xaxis1'].update(title=axis_label)
    fig['layout']['yaxis1'].update(title='Probability Density')
    fig['layout']['xaxis2'].update(title=axis_label)
    fig['layout']['yaxis2'].update(title='Default Rate')

    return fig


def _default_rate_by_value(data, feature):
    """Return the default rate ('mean') and its standard error ('sem') per feature value."""
    rates = data.groupby(feature)['target'].agg(['mean', 'sem'])
    # a value observed only once has an undefined standard error; draw no spread
    rates['sem'] = rates['sem'].fillna(value=0)
    return rates


def _default_rate_bars(rates, confidence=0.95):
    """Return one bar trace per feature value with a normal-approximation error bar."""
    # two-sided critical value, e.g. ~1.96 for a 95% interval
    z = stats.norm.ppf(0.5 + confidence / 2)
    return [
        go.Bar(x=[value], y=[rate], name=str(value),
               error_y=dict(array=[float(z * sem)], visible=True))
        for value, rate, sem in zip(rates.index, rates['mean'], rates['sem'])
    ]


def _default_rate_band(rates):
    """Return the default-rate line plus a shaded +/- 1 standard-error band."""
    values = rates.index
    rate = rates['mean']
    spread = rates['sem']

    lower = go.Scatter(x=values, y=rate - spread, mode='lines',
                       marker=dict(color="#444"), line=dict(width=0),
                       showlegend=False)
    upper = go.Scatter(x=values, y=rate + spread, mode='lines',
                       marker=dict(color="#444"), line=dict(width=0),
                       fill='tonexty', fillcolor='rgba(68, 68, 68, 0.3)',
                       showlegend=False)
    line = go.Scatter(name='Default Rate', x=values, y=rate,
                      line=dict(color='rgb(31, 119, 180)', width=1),
                      fillcolor='rgba(68, 68, 68, 0.3)', mode='lines')

    # order matters: 'tonexty' on `upper` fills down to the trace added just
    # before it, so `lower` has to come first
    return [lower, upper, line]
In [10]:
# loan_amnt: per-status distribution and default-rate panels
feature = 'loan_amnt'
fig = numerical_plot(train, feature, bins=50, width=1000, height=450)
iplot(fig)
In [11]:
# int_rate: per-status distribution and default-rate panels
feature = 'int_rate'
fig = numerical_plot(train, feature, bins=50, width=1000, height=450)
iplot(fig)
In [12]:
# installment_ratio: per-status distribution and default-rate panels
feature = 'installment_ratio'
fig = numerical_plot(train, feature, bins=50, width=1000, height=450)
iplot(fig)
In [13]:
# emp_length: per-status distribution and default-rate panels
feature = 'emp_length'
fig = numerical_plot(train, feature, bins=50, width=1000, height=450)
iplot(fig)
In [ ]:
# annual_inc: per-status distribution and default-rate panels
feature = 'annual_inc'
fig = numerical_plot(train, feature, bins=50, width=1000, height=450)
iplot(fig)
In [ ]:
# dti: per-status distribution and default-rate panels
feature = 'dti'
fig = numerical_plot(train, feature, bins=50, width=1000, height=450)
iplot(fig)
In [ ]:
# delinq_2yrs: per-status distribution and default-rate panels
feature = 'delinq_2yrs'
fig = numerical_plot(train, feature, bins=50, width=1000, height=450)
iplot(fig)
In [ ]:
# inq_last_6mths: per-status distribution and default-rate panels
feature = 'inq_last_6mths'
fig = numerical_plot(train, feature, bins=50, width=1000, height=450)
iplot(fig)
In [ ]:
# open_acc: per-status distribution and default-rate panels
feature = 'open_acc'
fig = numerical_plot(train, feature, bins=50, width=1000, height=450)
iplot(fig)
In [ ]:
# pub_rec: per-status distribution and default-rate panels
feature = 'pub_rec'
fig = numerical_plot(train, feature, bins=50, width=1000, height=450)
iplot(fig)
In [ ]:
# revol_bal: per-status distribution and default-rate panels
feature = 'revol_bal'
fig = numerical_plot(train, feature, bins=50, width=1000, height=450)
iplot(fig)
In [ ]:
# revol_util: per-status distribution and default-rate panels
feature = 'revol_util'
fig = numerical_plot(train, feature, bins=50, width=1000, height=450)
iplot(fig)
In [ ]:
# total_acc: per-status distribution and default-rate panels
feature = 'total_acc'
fig = numerical_plot(train, feature, bins=50, width=1000, height=450)
iplot(fig)
In [ ]:
# collections_12_mths_ex_med: per-status distribution and default-rate panels
feature = 'collections_12_mths_ex_med'
fig = numerical_plot(train, feature, bins=50, width=1000, height=450)
iplot(fig)
In [ ]:
# acc_now_delinq: per-status distribution and default-rate panels
feature = 'acc_now_delinq'
fig = numerical_plot(train, feature, bins=50, width=1000, height=450)
iplot(fig)
In [ ]:
# tot_coll_amt: per-status distribution and default-rate panels
feature = 'tot_coll_amt'
fig = numerical_plot(train, feature, bins=50, width=1000, height=450)
iplot(fig)
In [ ]:
# tot_cur_bal: per-status distribution and default-rate panels
feature = 'tot_cur_bal'
fig = numerical_plot(train, feature, bins=50, width=1000, height=450)
iplot(fig)
In [ ]:
# total_rev_hi_lim: per-status distribution and default-rate panels
feature = 'total_rev_hi_lim'
fig = numerical_plot(train, feature, bins=50, width=1000, height=450)
iplot(fig)
In [ ]:
# acc_open_past_24mths: per-status distribution and default-rate panels
feature = 'acc_open_past_24mths'
fig = numerical_plot(train, feature, bins=50, width=1000, height=450)
iplot(fig)
In [ ]:
# avg_cur_bal: per-status distribution and default-rate panels
feature = 'avg_cur_bal'
fig = numerical_plot(train, feature, bins=50, width=1000, height=450)
iplot(fig)
In [ ]:
# bc_open_to_buy: per-status distribution and default-rate panels
feature = 'bc_open_to_buy'
fig = numerical_plot(train, feature, bins=50, width=1000, height=450)
iplot(fig)
In [ ]:
# bc_util: per-status distribution and default-rate panels
feature = 'bc_util'
fig = numerical_plot(train, feature, bins=50, width=1000, height=450)
iplot(fig)
In [ ]:
# chargeoff_within_12_mths: per-status distribution and default-rate panels
feature = 'chargeoff_within_12_mths'
fig = numerical_plot(train, feature, bins=50, width=1000, height=450)
iplot(fig)
In [ ]:
# delinq_amnt: per-status distribution and default-rate panels
feature = 'delinq_amnt'
fig = numerical_plot(train, feature, bins=50, width=1000, height=450)
iplot(fig)
In [ ]:
# mo_sin_old_il_acct: per-status distribution and default-rate panels
feature = 'mo_sin_old_il_acct'
fig = numerical_plot(train, feature, bins=50, width=1000, height=450)
iplot(fig)
In [ ]:
# mo_sin_old_rev_tl_op: per-status distribution and default-rate panels
feature = 'mo_sin_old_rev_tl_op'
fig = numerical_plot(train, feature, bins=50, width=1000, height=450)
iplot(fig)
In [ ]:
# mo_sin_rcnt_rev_tl_op: per-status distribution and default-rate panels
feature = 'mo_sin_rcnt_rev_tl_op'
fig = numerical_plot(train, feature, bins=50, width=1000, height=450)
iplot(fig)
In [ ]:
# mo_sin_rcnt_tl: per-status distribution and default-rate panels
feature = 'mo_sin_rcnt_tl'
fig = numerical_plot(train, feature, bins=50, width=1000, height=450)
iplot(fig)
In [ ]:
# mort_acc: per-status distribution and default-rate panels
feature = 'mort_acc'
fig = numerical_plot(train, feature, bins=50, width=1000, height=450)
iplot(fig)
In [ ]:
# mths_since_recent_bc: per-status distribution and default-rate panels
feature = 'mths_since_recent_bc'
fig = numerical_plot(train, feature, bins=50, width=1000, height=450)
iplot(fig)
In [ ]:
# mths_since_recent_inq: per-status distribution and default-rate panels
feature = 'mths_since_recent_inq'
fig = numerical_plot(train, feature, bins=50, width=1000, height=450)
iplot(fig)
In [ ]:
# num_accts_ever_120_pd: per-status distribution and default-rate panels
feature = 'num_accts_ever_120_pd'
fig = numerical_plot(train, feature, bins=50, width=1000, height=450)
iplot(fig)
In [ ]:
# num_actv_bc_tl: per-status distribution and default-rate panels
feature = 'num_actv_bc_tl'
fig = numerical_plot(train, feature, bins=50, width=1000, height=450)
iplot(fig)
In [ ]:
# num_actv_rev_tl: per-status distribution and default-rate panels
feature = 'num_actv_rev_tl'
fig = numerical_plot(train, feature, bins=50, width=1000, height=450)
iplot(fig)
In [ ]:
# num_bc_sats: per-status distribution and default-rate panels
feature = 'num_bc_sats'
fig = numerical_plot(train, feature, bins=50, width=1000, height=450)
iplot(fig)
In [ ]:
# num_bc_tl: per-status distribution and default-rate panels
feature = 'num_bc_tl'
fig = numerical_plot(train, feature, bins=50, width=1000, height=450)
iplot(fig)
In [ ]:
# num_il_tl: per-status distribution and default-rate panels
feature = 'num_il_tl'
fig = numerical_plot(train, feature, bins=50, width=1000, height=450)
iplot(fig)
In [ ]:
# num_op_rev_tl: per-status distribution and default-rate panels
feature = 'num_op_rev_tl'
fig = numerical_plot(train, feature, bins=50, width=1000, height=450)
iplot(fig)
In [ ]:
# num_rev_accts: per-status distribution and default-rate panels
feature = 'num_rev_accts'
fig = numerical_plot(train, feature, bins=50, width=1000, height=450)
iplot(fig)
In [ ]:
# num_rev_tl_bal_gt_0: per-status distribution and default-rate panels
feature = 'num_rev_tl_bal_gt_0'
fig = numerical_plot(train, feature, bins=50, width=1000, height=450)
iplot(fig)
In [ ]:
# num_sats: per-status distribution and default-rate panels
feature = 'num_sats'
fig = numerical_plot(train, feature, bins=50, width=1000, height=450)
iplot(fig)
In [ ]:
# num_tl_120dpd_2m: per-status distribution and default-rate panels
feature = 'num_tl_120dpd_2m'
fig = numerical_plot(train, feature, bins=50, width=1000, height=450)
iplot(fig)
In [ ]:
# num_tl_30dpd: per-status distribution and default-rate panels
feature = 'num_tl_30dpd'
fig = numerical_plot(train, feature, bins=50, width=1000, height=450)
iplot(fig)
In [ ]:
# num_tl_90g_dpd_24m: per-status distribution and default-rate panels
feature = 'num_tl_90g_dpd_24m'
fig = numerical_plot(train, feature, bins=50, width=1000, height=450)
iplot(fig)
In [ ]:
# num_tl_op_past_12m: per-status distribution and default-rate panels
feature = 'num_tl_op_past_12m'
fig = numerical_plot(train, feature, bins=50, width=1000, height=450)
iplot(fig)
In [ ]:
# pct_tl_nvr_dlq: per-status distribution and default-rate panels
feature = 'pct_tl_nvr_dlq'
fig = numerical_plot(train, feature, bins=50, width=1000, height=450)
iplot(fig)
In [ ]:
# percent_bc_gt_75: per-status distribution and default-rate panels
feature = 'percent_bc_gt_75'
fig = numerical_plot(train, feature, bins=50, width=1000, height=450)
iplot(fig)
In [ ]:
# pub_rec_bankruptcies: per-status distribution and default-rate panels
feature = 'pub_rec_bankruptcies'
fig = numerical_plot(train, feature, bins=50, width=1000, height=450)
iplot(fig)
In [ ]:
# tax_liens: per-status distribution and default-rate panels
feature = 'tax_liens'
fig = numerical_plot(train, feature, bins=50, width=1000, height=450)
iplot(fig)
In [ ]:
# tot_hi_cred_lim: per-status distribution and default-rate panels
feature = 'tot_hi_cred_lim'
fig = numerical_plot(train, feature, bins=50, width=1000, height=450)
iplot(fig)
In [ ]:
# total_bal_ex_mort: per-status distribution and default-rate panels
feature = 'total_bal_ex_mort'
fig = numerical_plot(train, feature, bins=50, width=1000, height=450)
iplot(fig)
In [ ]:
# total_bc_limit: per-status distribution and default-rate panels
feature = 'total_bc_limit'
fig = numerical_plot(train, feature, bins=50, width=1000, height=450)
iplot(fig)
In [ ]:
# total_il_high_credit_limit: per-status distribution and default-rate panels
feature = 'total_il_high_credit_limit'
fig = numerical_plot(train, feature, bins=50, width=1000, height=450)
iplot(fig)
In [ ]:
# credit_length: per-status distribution and default-rate panels
feature = 'credit_length'
fig = numerical_plot(train, feature, bins=50, width=1000, height=450)
iplot(fig)